Library

# Loading libraries
library(tidyverse)
library(readxl)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(reshape2)
library(viridisLite)
library(unikn)
library(readr)
library(caret)
library(class)
library(zoo)
library(fastDummies)
library(randomForest)
library(e1071)
library(xgboost)
library(ipred)
library(rpart)
library(plotly)

Uploading Data

# Read the raw kidney-disease CSV and give every column a descriptive name
data1 <- read_csv("kidney.csv")
names(data1) <- c(
  "ID", "age", "blood_pressure", "specific_gravity", "albumin", "sugar",
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "blood_glucose_random", "blood_urea", "serum_creatinine", "sodium",
  "potassium", "haemoglobin", "packed_cell_volume",
  "white_blood_cell_count", "red_blood_cell_count", "hypertension",
  "diabetes_mellitus", "coronary_artery_disease", "appetite", "peda_edema",
  "anemia", "class"
)

# Work on a plain data frame; data_clean keeps only the rows without any NA
data <- as.data.frame(data1)
data_clean <- na.omit(data)

Change Variables from Character to Numeric

# Coerce two lab columns to numeric (as.numeric() yields NA for any entry it
# cannot parse) and treat the row identifier as a factor
data[["white_blood_cell_count"]] <- as.numeric(data[["white_blood_cell_count"]])
data[["packed_cell_volume"]] <- as.numeric(data[["packed_cell_volume"]])
data[["ID"]] <- as.factor(data[["ID"]])
# Recode the target: 0 when class is "ckd", 1 for any other value (NA stays NA)
data[["class"]] <- as.numeric(data[["class"]] != "ckd")
head(data)
##   ID age blood_pressure specific_gravity albumin sugar red_blood_cells pus_cell
## 1  0  48             80            1.020       1     0            <NA>   normal
## 2  1   7             50            1.020       4     0            <NA>   normal
## 3  2  62             80            1.010       2     3          normal   normal
## 4  3  48             70            1.005       4     0          normal abnormal
## 5  4  51             80            1.010       2     0          normal   normal
## 6  5  60             90            1.015       3     0            <NA>     <NA>
##   pus_cell_clumps   bacteria blood_glucose_random blood_urea serum_creatinine
## 1      notpresent notpresent                  121         36              1.2
## 2      notpresent notpresent                   NA         18              0.8
## 3      notpresent notpresent                  423         53              1.8
## 4         present notpresent                  117         56              3.8
## 5      notpresent notpresent                  106         26              1.4
## 6      notpresent notpresent                   74         25              1.1
##   sodium potassium haemoglobin packed_cell_volume white_blood_cell_count
## 1     NA        NA        15.4                 44                   7800
## 2     NA        NA        11.3                 38                   6000
## 3     NA        NA         9.6                 31                   7500
## 4    111       2.5        11.2                 32                   6700
## 5     NA        NA        11.6                 35                   7300
## 6    142       3.2        12.2                 39                   7800
##   red_blood_cell_count hypertension diabetes_mellitus coronary_artery_disease
## 1                  5.2          yes               yes                      no
## 2                 <NA>           no                no                      no
## 3                 <NA>           no               yes                      no
## 4                  3.9          yes                no                      no
## 5                  4.6           no                no                      no
## 6                  4.4          yes               yes                      no
##   appetite peda_edema anemia class
## 1     good         no     no     0
## 2     good         no     no     0
## 3     poor         no    yes     0
## 4     poor        yes    yes     0
## 5     good         no     no     0
## 6     good        yes     no     0

Extracting Categorical and Numerical Columns

# Names of the character columns and of the numeric columns
cat_cols <- names(Filter(is.character, data))
num_cols <- names(Filter(is.numeric, data))
## Blank out diabetes_mellitus entries that are neither "yes" nor "no"
data$diabetes_mellitus[!(data$diabetes_mellitus %in% c("yes", "no"))] <- NA
data[["class"]] <- as.numeric(data[["class"]])

Checking numerical features distribution

# One density panel per numeric column. The numeric columns are stacked into
# long format (Feature = column name, Value = cell value) so that facet_wrap()
# can draw a separate panel for each of them.
# pivot_longer() replaces the superseded gather(), and all_of() makes the
# selection by an external character vector explicit instead of relying on the
# deprecated bare-vector form.
ggplot(
  data = pivot_longer(
    data,
    cols = all_of(num_cols),
    names_to = "Feature",
    values_to = "Value"
  ),
  aes(x = Value, fill = Feature)
) +
  geom_density(alpha = 0.5) +
  # Free scales: the columns are measured in very different units
  facet_wrap(~Feature, scales = "free") +
  theme_minimal() +
  scale_fill_manual(values = cm.colors(length(num_cols))) +
  labs(title = "Numeric variables distribution", x = "Value", y = "Density")

##Looking at categorical columns

# Define the color palette
my_colors <- hcl.colors(3, palette = "Cold")

# Define the categorical columns to plot
cat_cols_to_plot <- c("red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
                      "hypertension", "diabetes_mellitus", "coronary_artery_disease",
                      "appetite", "peda_edema", "anemia")

# Build one bar chart per categorical column. lapply() creates the whole list
# in one go instead of growing it inside a loop, and gives every plot its own
# function environment, so `column` inside aes() always refers to the column
# that plot was built for.
plot_list <- lapply(cat_cols_to_plot, function(column) {
  # Keep only rows where this column is neither NA nor an empty string
  filtered_data <- data %>%
    filter(!is.na(.data[[column]]) & .data[[column]] != "") %>%
    select(all_of(c("class", column)))

  # .data[[column]] replaces the deprecated aes_string(); the x-axis title is
  # set explicitly so it still shows the column name
  ggplot(filtered_data, aes(x = .data[[column]], fill = .data[[column]])) +
    geom_bar() +
    theme_minimal() +
    labs(title = column, x = column) +
    theme(legend.position = "none") +
    scale_fill_manual(values = my_colors)  # Color palette
})
# Name the list entries after their columns
names(plot_list) <- cat_cols_to_plot

# Combine the plots into a single grid and display them in RStudio
combined_plot <- grid.arrange(grobs = plot_list, ncol = 4)

# Printing the returned object lists the layout table of the arranged grobs
print(combined_plot)
## TableGrob (3 x 4) "arrange": 10 grobs
##                          z     cells    name           grob
## red_blood_cells          1 (1-1,1-1) arrange gtable[layout]
## pus_cell                 2 (1-1,2-2) arrange gtable[layout]
## pus_cell_clumps          3 (1-1,3-3) arrange gtable[layout]
## bacteria                 4 (1-1,4-4) arrange gtable[layout]
## hypertension             5 (2-2,1-1) arrange gtable[layout]
## diabetes_mellitus        6 (2-2,2-2) arrange gtable[layout]
## coronary_artery_disease  7 (2-2,3-3) arrange gtable[layout]
## appetite                 8 (2-2,4-4) arrange gtable[layout]
## peda_edema               9 (3-3,1-1) arrange gtable[layout]
## anemia                  10 (3-3,2-2) arrange gtable[layout]

##Heatmap of data

# Keep only the numeric columns of the data frame
numeric_data <- Filter(is.numeric, data)

## Pairwise correlations, computed on the rows with no missing value
heatmap_data <- cor(numeric_data, use = "complete.obs")

# Tile heatmap of the melted correlation matrix with the rounded coefficient
# printed in each cell
ggplot(melt(heatmap_data), aes(x = Var2, y = Var1, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), vjust = 1) +
  scale_fill_viridis_c(option = "viridis") +
  labs(title = "Correlation Heatmap") +
  theme_minimal() +
  # Rotate the x-axis labels so the long column names stay readable
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

EXPLORATORY DATA ANALYSIS

DENSITY PLOT AND VIOLIN PLOT FOR AGE

# Density plot for the "age" variable, one curve per class
# (class was recoded earlier as 0 = CKD, 1 = NotCKD)
ggplot(data, aes(x = age, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD")) +
  labs(title = "Distribution of Age by CKD Status", x = "Age", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot for the "age" variable differentiating between CKD and NotCKD
ggplot(data, aes(x = factor(class), y = age, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61","#6F8A91"), labels = c("CKD", "NotCKD" )) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Age by CKD Status", x = "CKD Status", y = "Age") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR ALBUMIN

# Density plot for the "albumin" variable, one curve per class
# (class was recoded earlier as 0 = CKD, 1 = NotCKD)
ggplot(data, aes(x = albumin, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD"))  +
  labs(title = "Distribution of Albumin by CKD Status", x = "Albumin", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with boxplot for the "Albumin" variable differentiating between CKD and NotCKD
ggplot(data, aes(x = factor(class), y = albumin, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61","#6F8A91"), labels = c("CKD", "NotCKD" )) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Albumin by CKD Status", x = "CKD Status", y = "Albumin") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR BLOOD PRESSURE

# Blood Pressure
# Density plot for the "blood_pressure" variable differentiating between CKD and NotCKD
# (class was recoded earlier as 0 = CKD, 1 = NotCKD)
ggplot(data, aes(x = blood_pressure, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD"))  +
  labs(title = "Distribution of Blood Pressure by CKD Status", x = "Blood Pressure", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with boxplot for the "blood_pressure" variable differentiating between CKD and NotCKD
ggplot(data, aes(x = factor(class), y = blood_pressure, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61","#6F8A91"), labels = c("CKD", "NotCKD" )) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Blood Pressure by CKD Status", x = "CKD Status", y = "Blood Pressure") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR BLOOD GLUCOSE RANDOM

# Remove rows with missing values in the "blood_glucose_random" variable.
# The filtered frame is built once and shared by both plots below.
data_cleaned_blood_glucose <- data[!is.na(data$blood_glucose_random), ]

# Density plot for the "blood_glucose_random" variable differentiating between CKD and NotCKD
# (class was recoded earlier as 0 = CKD, 1 = NotCKD)
ggplot(data_cleaned_blood_glucose, aes(x = blood_glucose_random, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD"))  +
  labs(title = "Distribution of Blood Glucose (Random) by CKD Status", x = "Blood Glucose (Random)", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with box plot for the "blood_glucose_random" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_blood_glucose, aes(x = factor(class), y = blood_glucose_random, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61","#6F8A91"), labels = c("CKD", "NotCKD" )) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Blood Glucose (Random) by CKD Status", x = "CKD Status", y = "Blood Glucose (Random)") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR BLOOD UREA

# Remove rows with missing values in the "blood_urea" variable.
# The filtered frame is built once and shared by both plots below.
data_cleaned_blood_urea <- data[!is.na(data$blood_urea), ]

# Density plot for the "blood_urea" variable differentiating between CKD and NotCKD
# (class was recoded earlier as 0 = CKD, 1 = NotCKD)
ggplot(data_cleaned_blood_urea, aes(x = blood_urea, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD")) +
  labs(title = "Distribution of Blood Urea by CKD Status", x = "Blood Urea", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with inner box plot for the "blood_urea" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_blood_urea, aes(x = factor(class), y = blood_urea, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61","#6F8A91"), labels = c("CKD", "NotCKD" )) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Blood Urea by CKD Status", x = "CKD Status", y = "Blood Urea") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR HEMOGLOBIN

# Remove rows with missing values in the "haemoglobin" variable
data_cleaned_haemoglobin <- data[!is.na(data$haemoglobin), ]

# Density plot for the "haemoglobin" variable differentiating between CKD and NotCKD
# (class was recoded earlier as 0 = CKD, 1 = NotCKD)
ggplot(data_cleaned_haemoglobin, aes(x = haemoglobin, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD"))  +
  labs(title = "Distribution of Hemoglobin by CKD Status", x = "Haemoglobin", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with inner box plot for the "haemoglobin" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_haemoglobin, aes(x = factor(class), y = haemoglobin, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61","#6F8A91"), labels = c("CKD", "NotCKD" )) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Hemoglobin by CKD Status", x = "CKD Status", y = "Haemoglobin") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR SERUM CREATININE

# Remove rows with missing values in the "serum_creatinine" variable
data_cleaned_serum_creatinine <- data[!is.na(data$serum_creatinine), ]

# Density plot for the "serum_creatinine" variable differentiating between CKD and NotCKD
# (class was recoded earlier as 0 = CKD, 1 = NotCKD).
# The title and x label name serum creatinine, matching the violin plot below
# (the title previously read "Hemoglobin", carried over from the prior section).
ggplot(data_cleaned_serum_creatinine, aes(x = serum_creatinine, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD"))  +
  labs(title = "Distribution of Serum Creatinine by CKD Status", x = "Serum Creatinine", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with inner box plot for the "serum_creatinine" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_serum_creatinine, aes(x = factor(class), y = serum_creatinine, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  scale_fill_manual(values = c("#FF6F61", "#6F8A91"), labels = c("CKD", "NotCKD")) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Serum Creatinine by CKD Status", x = "CKD Status", y = "Serum Creatinine") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR SPECIFIC GRAVITY

# Remove rows with missing values in the "specific_gravity" variable
data_cleaned_specific_gravity <- data[!is.na(data$specific_gravity), ]

# Density plot for the "specific_gravity" variable differentiating between CKD and NotCKD.
# class was recoded earlier as 0 = CKD, 1 = NotCKD, and unnamed values/labels
# are applied in level order, so the first entry must describe CKD. (Listing
# "NotCKD" first would label the CKD patients as NotCKD and vice versa.)
ggplot(data_cleaned_specific_gravity, aes(x = specific_gravity, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD")) +
  labs(title = "Distribution of Specific Gravity by CKD Status", x = "Specific Gravity", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with inner box plot for the "specific_gravity" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_specific_gravity, aes(x = factor(class), y = specific_gravity, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  # Same level order as above: first entry = level 0 = CKD
  scale_fill_manual(values = c("#FF6F61", "#6F8A91"), labels = c("CKD", "NotCKD")) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of Specific Gravity by CKD Status", x = "CKD Status", y = "Specific Gravity") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DENSITY PLOT AND VIOLIN PLOT FOR WHITE BLOOD CELLS

# Remove rows with missing values in the "white_blood_cell_count" variable
data_cleaned_white_blood_cell_count <- data[!is.na(data$white_blood_cell_count), ]

# Density plot for the "white_blood_cell_count" variable differentiating between CKD and NotCKD.
# class was recoded earlier as 0 = CKD, 1 = NotCKD, and unnamed values/labels
# are applied in level order, so the first entry must describe CKD. (Listing
# "NotCKD" first would label the CKD patients as NotCKD and vice versa.)
ggplot(data_cleaned_white_blood_cell_count, aes(x = white_blood_cell_count, fill = factor(class))) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("red", "blue"), labels = c("CKD", "NotCKD")) +
  labs(title = "Distribution of White Blood Cell Count by CKD Status", x = "White Blood Cell Count", y = "Density") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

# Violin plot with inner box plot for the "white_blood_cell_count" variable differentiating between CKD and NotCKD
ggplot(data_cleaned_white_blood_cell_count, aes(x = factor(class), y = white_blood_cell_count, fill = factor(class))) +
  geom_violin(trim = FALSE, size = 0.5, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white", outlier.shape = NA) +
  # Same level order as above: first entry = level 0 = CKD
  scale_fill_manual(values = c("#FF6F61", "#6F8A91"), labels = c("CKD", "NotCKD")) +
  # Named labels are matched against the factor levels ("0"/"1"); names that
  # match no level are ignored and the axis would show the raw codes
  scale_x_discrete(labels = c("0" = "CKD", "1" = "NotCKD")) +
  labs(title = "Distribution of White Blood Cell Count by CKD Status", x = "CKD Status", y = "White Blood Cell Count") +
  theme_minimal() +
  theme(panel.grid.major = element_line(color = "gray", size = 0.2),
        panel.grid.minor = element_blank())

DATA PREPROCESSING

# Reload the raw CSV so preprocessing starts from untouched values, and give
# every column a descriptive name
data1 <- read_csv("kidney.csv")
names(data1) <- c(
  "ID", "age", "blood_pressure", "specific_gravity", "albumin", "sugar",
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "blood_glucose_random", "blood_urea", "serum_creatinine", "sodium",
  "potassium", "haemoglobin", "packed_cell_volume",
  "white_blood_cell_count", "red_blood_cell_count", "hypertension",
  "diabetes_mellitus", "coronary_artery_disease", "appetite", "pedal_edema",
  "anemia", "class"
)

data <- as.data.frame(data1)

# Coerce the three count columns to numeric (as.numeric() yields NA for any
# entry it cannot parse) and treat the row identifier as a factor
data[["white_blood_cell_count"]] <- as.numeric(data[["white_blood_cell_count"]])
data[["packed_cell_volume"]] <- as.numeric(data[["packed_cell_volume"]])
data[["red_blood_cell_count"]] <- as.numeric(data[["red_blood_cell_count"]])
data[["ID"]] <- as.factor(data[["ID"]])

Dividing the data frame into categorical and numeric variables

# Split the columns by type: numeric ones and character (categorical) ones
data_numeric <- Filter(is.numeric, data)
data_categorical <- Filter(is.character, data)

PREPROCESSING NUMERIC DATA

# Calculate the mean of each numeric column, ignoring missing values
column_means <- colMeans(data_numeric, na.rm = TRUE)

# Logical matrix (same shape as data_numeric) flagging the NA cells
nas <- is.na(data_numeric)

# Impute every NA with the mean of its OWN column. col(nas)[nas] gives the
# column index of each NA cell in column-major order, which is the order in
# which the matrix-indexed assignment consumes the replacement values.
# (Indexing column_means with rep(1, ...) would fill the gaps of every column
# with the mean of the first column only.)
data_numeric[nas] <- unname(column_means[col(nas)[nas]])

PREPROCESSING CATEGORICAL DATA

# Encode every two-level text column as a number: 0 for the reference value
# named in the comparison, 1 for any other value. Missing entries stay NA.
data_categorical <- data_categorical %>%
  mutate(
    red_blood_cells = as.numeric(red_blood_cells != "normal"),
    pus_cell = as.numeric(pus_cell != "normal"),
    pus_cell_clumps = as.numeric(pus_cell_clumps != "notpresent"),
    bacteria = as.numeric(bacteria != "notpresent"),
    hypertension = as.numeric(hypertension != "no"),
    diabetes_mellitus = as.numeric(diabetes_mellitus != "no"),
    coronary_artery_disease = as.numeric(coronary_artery_disease != "no"),
    appetite = as.numeric(appetite != "good"),
    pedal_edema = as.numeric(pedal_edema != "no"),
    anemia = as.numeric(anemia != "no"),
    class = as.numeric(class != "notckd")
  )

# Fill the remaining NAs of each column with its most frequent code
# (ties between 0 and 1 resolve to 0)
data_categorical[] <- lapply(data_categorical, function(codes) {
  zeros <- sum(codes == 0, na.rm = TRUE)
  ones <- sum(codes == 1, na.rm = TRUE)
  codes[is.na(codes)] <- if (zeros >= ones) 0 else 1
  codes
})

# Names of the binary columns
column_names <- c(
  "red_blood_cells", "pus_cell", "pus_cell_clumps", "bacteria",
  "hypertension", "diabetes_mellitus", "coronary_artery_disease",
  "appetite", "pedal_edema", "anemia", "class"
)

# Turn the 0/1 codes into factors
data_categorical[column_names] <- lapply(
  data_categorical[column_names],
  function(codes) as.factor(codes)
)

summary(data_categorical)
##  red_blood_cells pus_cell pus_cell_clumps bacteria hypertension
##  0:353           0:324    0:358           0:378    0:253       
##  1: 47           1: 76    1: 42           1: 22    1:147       
##  diabetes_mellitus coronary_artery_disease appetite pedal_edema anemia  class  
##  0:263             0:366                   0:318    0:324       0:340   0:150  
##  1:137             1: 34                   1: 82    1: 76       1: 60   1:250
# Add indicator (dummy) columns for the binary factor columns
data_dummy <- data_categorical %>%
  dummy_cols(select_columns = column_names)

# Keep only the leading 11 columns of data_dummy; the summary printed below
# shows these are the original factor columns, not the added indicators
data_dummy_subset <- data_dummy[, seq_len(11)]
summary(data_dummy_subset)
##  red_blood_cells pus_cell pus_cell_clumps bacteria hypertension
##  0:353           0:324    0:358           0:378    0:253       
##  1: 47           1: 76    1: 42           1: 22    1:147       
##  diabetes_mellitus coronary_artery_disease appetite pedal_edema anemia  class  
##  0:263             0:366                   0:318    0:324       0:340   0:150  
##  1:137             1: 34                   1: 82    1: 76       1: 60   1:250
# Put the imputed numeric columns and the factor columns side by side
data_combined <- data.frame(data_numeric, data_dummy_subset, check.names = FALSE)

MODEL BUILDING

# The label column is "class"; every other column is a feature
dep_col <- "class"
ind_col <- setdiff(names(data_combined), dep_col)

X <- data_combined[, ind_col]
y <- data_combined[[dep_col]]

# 70/30 train/test split; the fixed seed makes the partition reproducible
set.seed(0)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)

# Rows picked by the partition form the training set, the rest the test set
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]

Train the KNN model

# Classify each test row by majority vote among its 5 nearest training rows.
# knn() returns the predicted labels for the test set directly.
# NOTE(review): the features are passed unscaled and include factor columns;
# distance-based models are usually fed standardized numeric inputs - confirm
# whether that is intended here.
knn_model <- knn(train = X_train, test = X_test, cl = y_train, k = 5)

# Share of test rows whose predicted label equals the true label
knn_acc <- sum(knn_model == y_test) / length(y_test)

# Report the KNN test accuracy
cat("Test Accuracy of KNN is:", knn_acc, "\n")
## Test Accuracy of KNN is: 0.7166667

DECISION TREE

# Fit a classification tree of y_train on all feature columns of X_train
# (y_train is a factor, so "class" is the method rpart would pick by default)
dtc_model <- rpart(y_train ~ ., data = X_train, method = "class")

# Predict class labels for the test rows and measure the share that is correct
dtc_pred <- predict(dtc_model, newdata = X_test, type = "class")
dtc_acc <- sum(dtc_pred == y_test) / length(y_test)

# Report the decision tree test accuracy
cat("Test Accuracy of Decision Tree Classifier is:", dtc_acc, "\n")
## Test Accuracy of Decision Tree Classifier is: 0.925

RANDOM FOREST CLASSIFIER

# Fit a random forest classifier on the training features and labels
rf_model <- randomForest(x = X_train, y = as.factor(y_train))

# Predict the test rows and measure the share of correct predictions
rf_pred <- predict(rf_model, newdata = X_test)
rf_acc <- sum(rf_pred == y_test) / length(y_test)

# Report the random forest test accuracy
cat("Test Accuracy of Random Forest Classifier is:", rf_acc, "\n")
## Test Accuracy of Random Forest Classifier is: 1
# Collect the test accuracy of each fitted model in one table
models <- data.frame(
  Model = c("KNN", "Decision Tree Classifier", "Random Forest Classifier"),
  Score = c(knn_acc, dtc_acc, rf_acc)
)

# Best-scoring model first (original row names are kept)
models <- models[order(-models$Score), ]

# Show the ranked table
print(models)
##                      Model     Score
## 3 Random Forest Classifier 1.0000000
## 2 Decision Tree Classifier 0.9250000
## 1                      KNN 0.7166667
# Horizontal bar chart comparing the test accuracy of the fitted models.
# Bars are coloured by score using a viridis palette.
plot_ly(
  data = models,
  x = ~Score,
  y = ~Model,
  color = ~Score,
  type = "bar",
  orientation = "h",
  colors = viridis(length(unique(models$Score))),  # Use viridis color palette
  text = ~paste("Score: ", round(Score, 3))
) %>%
  # Titles are layout properties, so they are set through layout();
  # plot_ly() has no `layout` argument and would not apply them.
  # NOTE(review): the previous code also requested template = "plotly_dark",
  # which looks like a plotly.py named template with no direct R equivalent,
  # so it is left out here - confirm whether a dark theme is wanted.
  plotly::layout(
    title = "Models Comparison",
    xaxis = list(title = "Score"),
    yaxis = list(title = "Model")
  )